## Warning: package 'BiocGenerics' was built under R version 4.0.5
## Warning: package 'GenomeInfoDb' was built under R version 4.0.5
load("~/Documents/MiGASti/Databases/gene_matrix.RData")
metadata <- read.table("~/Documents/MiGASti/Databases/metadata2.txt")
#set rownames to Sample
row.names(metadata) <- metadata$Sample 
setwd("~/Documents/MiGASti/Databases")
#exclude samples that did not pass QC filtering
exclude <- read.table("samples2remove2.txt")
exclude <- exclude$x
genes_counts_filt = genes_counts[, !colnames(genes_counts) %in% exclude] 
#Excludes the samples from filters. 
#dim(genes_counts_filt)
metadata_filt = metadata[ !(rownames(metadata) %in% exclude), ]
length(metadata_filt)
gencode_30 = read.table("~/Documents/MiGASti/Databases/ens.geneid.gencode.v30")
colnames(gencode_30) = c("ensembl","symbol")
#order metadata and genes counts
genes_counts_ordered <- genes_counts_filt[,rownames(metadata_filt)]
#head(genes_counts_ordered)
all(rownames(metadata_filt) == colnames (genes_counts_ordered)) #TRUE

Preparing the samples for DEG

#remove uncultured samples
metadata_cultured <- metadata_filt[metadata_filt$Stimulation != "ununstim",]
#remove uncultured samples
metadata_cultured_subset <- metadata_cultured[metadata_cultured$Stimulation != "TNFa",]
#check numbers
#dim(metadata_cultured)
#check numbers per stimulation
#table(metadata_filt$Stimulation)
#select only GFM samples in metadata
metadata_GFM = metadata_cultured_subset[metadata_cultured_subset$Region=="GFM",]
#select only GFM samples in genes counts
genes_counts_GFM <- genes_counts_ordered[,metadata_GFM$Sample]
#order metadata and genes_counts
genes_counts_GFM_ordered <- genes_counts_GFM[,rownames(metadata_GFM)]
#check ordering
all(rownames(metadata_GFM) == colnames (genes_counts_GFM_ordered)) #TRUE

[1] TRUE

#check numbers
#dim(metadata_cultured)
#check numbers per stimulation
#table(metadata_filt$Stimulation)
#select only GFM samples in metadata
metadata_GFM = metadata_cultured[metadata_cultured$Region=="GFM",]
#select only GFM samples in genes counts
genes_counts_GFM <- genes_counts_ordered[,metadata_GFM$Sample]
#order metadata and genes_counts
genes_counts_GFM_ordered <- genes_counts_GFM[,rownames(metadata_GFM)]
#check ordering
all(rownames(metadata_GFM) == colnames (genes_counts_GFM_ordered)) #TRUE

[1] TRUE

#round counts; deseq2 can only handle integers
genes_counts_GFM_ordered <- round(genes_counts_GFM_ordered, digits=0)

#make sure covariate variables are the right format 
#cannot create dds object with numeric values
metadata_GFM$Donor_id <- as.factor(metadata_GFM$Donor_id)
metadata_GFM$age <- as.integer(metadata_GFM$age)
metadata_GFM$sex <- as.factor(metadata_GFM$sex)
metadata_GFM$Stimulation <- as.factor(metadata_GFM$Stimulation)
metadata_GFM$picard_pct_ribosomal_bases = scale(metadata_GFM$picard_pct_ribosomal_bases)
metadata_GFM$picard_pct_mrna_bases = scale(metadata_GFM$picard_pct_mrna_bases)
metadata_GFM$picard_pct_pf_reads_aligned = scale(metadata_GFM$picard_pct_pf_reads_aligned)
metadata_GFM$picard_pct_percent_duplication = scale(metadata_GFM$picard_pct_percent_duplication)

#adjust for: ~ age + (1|donor_id) + picard_pct_ribosomal_bases + picard_pct_mrna_bases +   picard_pct_percent_duplication + picard_pct_pf_reads_aligned 

DESeq2 of GFM samples

#createDeSEQ2 object for LPS
dds <- DESeqDataSetFromMatrix(countData = genes_counts_GFM_ordered,
                              colData = metadata_GFM,
                              design = ~ age + sex + picard_pct_ribosomal_bases + picard_pct_mrna_bases + picard_pct_percent_duplication + picard_pct_pf_reads_aligned + Stimulation) 
#variable of interest at end of the formula

#Make sure that control group is set as the reference group
dds$Stimulation <- relevel(dds$Stimulation, ref="unstim")
table(dds$Stimulation) 

unstim ATP DEX IFNy IL4 LPS R848 TNFa 38 3 9 23 7 33 17 23

#head(dds)

#filter: CPM > 1 in 50% of the samples 
keep.exp = rowSums(cpm(genes_counts_GFM_ordered) > 1) >= 0.5*ncol(genes_counts_GFM_ordered)
dds = dds[keep.exp,]

#Run differential expression 
dds <- DESeq(dds, betaPrior = FALSE)
resultsNames(dds)

[1] “Intercept” “age”
[3] “sex_m_vs_f” “picard_pct_ribosomal_bases”
[5] “picard_pct_mrna_bases” “picard_pct_percent_duplication” [7] “picard_pct_pf_reads_aligned” “Stimulation_ATP_vs_unstim”
[9] “Stimulation_DEX_vs_unstim” “Stimulation_IFNy_vs_unstim”
[11] “Stimulation_IL4_vs_unstim” “Stimulation_LPS_vs_unstim”
[13] “Stimulation_R848_vs_unstim” “Stimulation_TNFa_vs_unstim”

DESeq2: LPS vs unstim

Number of differentially expressed genes

# generate results table for LPS vs unstim
res_LPS <- results(dds, name="Stimulation_LPS_vs_unstim")
sum(res_LPS$padj < 0.05, na.rm=TRUE)

[1] 376

resOrdered_LPS <- res_LPS[order(res_LPS$pvalue),] 
resOrdered_LPS <- as.data.frame(resOrdered_LPS)

Volcano plot LPS vs unstim

head(res_LPS)

log2 fold change (MLE): Stimulation LPS vs unstim Wald test p-value: Stimulation LPS vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.0650 0.480442 0.326447 1.471731 0.1410936 ENSG00000000419.12 239.5074 0.122404 0.176788 0.692375 0.4887017 ENSG00000000457.14 94.7908 0.228420 0.173813 1.314172 0.1887883 ENSG00000000460.17 28.9771 -0.334744 0.192616 -1.737882 0.0822315 ENSG00000000938.13 566.1630 0.141110 0.139776 1.009547 0.3127126 ENSG00000000971.15 53.8737 0.513828 0.289637 1.774041 0.0760564 padj ENSG00000000003.14 0.611358 ENSG00000000419.12 0.853407 ENSG00000000457.14 0.661482 ENSG00000000460.17 0.498820 ENSG00000000938.13 0.762316 ENSG00000000971.15 0.486522

with(res_LPS, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
with(subset(res_LPS, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot LPS vs unstim

#The function plotMA shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet. Points will be colored if the adjusted p value is less than 0.1. Points which fall out of the window are plotted as open triangles pointing either up or down.

plotMA(res_LPS, ylim=c(-2,2))

TOP differentially expressed genes LPS vs unstim

setDT(resOrdered_LPS, keep.rownames = "ensembl")
resOrdered_LPS <- left_join(resOrdered_LPS, gencode_30, by = "ensembl")
resOrdered_LPS_top = resOrdered_LPS[order(resOrdered_LPS$padj) ,]
setDT(resOrdered_LPS_top, keep.rownames = "ensembl")
resOrdered_LPS_top = resOrdered_LPS_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_LPS_top)
write.table(resOrdered_LPS_top, "DEG_LPS_GFM.txt")

DESeq2: IFNy vs unstim

Number of differentially expressed genes

# generate results table for IFNy vs unstim
res_IFNy <- results(dds, name="Stimulation_IFNy_vs_unstim")
sum(res_IFNy$padj < 0.05, na.rm=TRUE)

[1] 155

resOrdered_IFNy <- res_IFNy[order(res_IFNy$pvalue),] 
resOrdered_IFNy <- as.data.frame(resOrdered_IFNy)

Volcano plot IFNy vs unstim

head(res_IFNy)

log2 fold change (MLE): Stimulation IFNy vs unstim Wald test p-value: Stimulation IFNy vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.0650 0.0417137 0.362275 0.115144 0.9083312 ENSG00000000419.12 239.5074 0.0265333 0.195448 0.135756 0.8920141 ENSG00000000457.14 94.7908 0.1557635 0.192311 0.809955 0.4179659 ENSG00000000460.17 28.9771 -0.2587047 0.210989 -1.226154 0.2201407 ENSG00000000938.13 566.1630 0.1368747 0.154611 0.885281 0.3760049 ENSG00000000971.15 53.8737 -0.6093795 0.326299 -1.867547 0.0618253 padj ENSG00000000003.14 NA ENSG00000000419.12 0.961819 ENSG00000000457.14 0.725413 ENSG00000000460.17 0.572282 ENSG00000000938.13 0.697828 ENSG00000000971.15 0.379810

with(res_IFNy, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-10,10)))
with(subset(res_IFNy, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot IFNy vs unstim

plotMA(res_IFNy, ylim=c(-2,2))

TOP differentially expressed genes IFNy vs unstim

setDT(resOrdered_IFNy, keep.rownames = "ensembl")
resOrdered_IFNy <- left_join(resOrdered_IFNy, gencode_30, by = "ensembl")
resOrdered_IFNy_top = resOrdered_IFNy[order(resOrdered_IFNy$padj) ,]
resOrdered_IFNy_top = head(resOrdered_IFNy)
setDT(resOrdered_IFNy_top, keep.rownames = "ensembl")
resOrdered_IFNy_top = resOrdered_IFNy_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_IFNy_top)
write.table(resOrdered_IFNy_top, "DEG_IFNy_GFM.txt")

DESeq2: R848 vs unstim

Number of differentially expressed genes

# generate results table for R848 vs unstim
res_R848 <- results(dds, name="Stimulation_R848_vs_unstim")
sum(res_R848$padj < 0.05, na.rm=TRUE)

[1] 31

resOrdered_R848 <- res_R848[order(res_R848$pvalue),] 
resOrdered_R848 <- as.data.frame(resOrdered_R848)

Volcano plot R848 vs unstim

head(res_R848)

log2 fold change (MLE): Stimulation R848 vs unstim Wald test p-value: Stimulation R848 vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.0650 0.1683922 0.396722 0.424459 0.6712312 ENSG00000000419.12 239.5074 -0.0450358 0.217607 -0.206959 0.8360419 ENSG00000000457.14 94.7908 -0.2695736 0.214465 -1.256956 0.2087696 ENSG00000000460.17 28.9771 -0.4154349 0.235702 -1.762541 0.0779779 ENSG00000000938.13 566.1630 0.1927277 0.172045 1.120214 0.2626225 ENSG00000000971.15 53.8737 -0.6725260 0.356756 -1.885117 0.0594141 padj ENSG00000000003.14 NA ENSG00000000419.12 0.953536 ENSG00000000457.14 0.603070 ENSG00000000460.17 NA ENSG00000000938.13 0.647297 ENSG00000000971.15 NA

with(res_R848, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-10,10)))
with(subset(res_R848, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot R848 vs unstim

plotMA(res_R848, ylim=c(-2,2))

TOP differentially expressed genes R848 vs unstim

setDT(resOrdered_R848, keep.rownames = "ensembl")
resOrdered_R848 <- left_join(resOrdered_R848, gencode_30, by = "ensembl")
resOrdered_R848_top = resOrdered_R848[order(resOrdered_R848$padj) ,]
setDT(resOrdered_R848_top, keep.rownames = "ensembl")
resOrdered_R848_top = resOrdered_R848_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_R848_top)

DESeq2: DEX vs unstim

Number of differentially expressed genes

# generate results table for DEX vs unstim
res_DEX <- results(dds, name="Stimulation_DEX_vs_unstim")
sum(res_DEX$padj < 0.05, na.rm=TRUE)

[1] 622

resOrdered_DEX <- res_DEX[order(res_DEX$pvalue),] 
resOrdered_DEX <- as.data.frame(resOrdered_DEX)

Volcano plot DEX vs unstim

head(res_DEX)

log2 fold change (MLE): Stimulation DEX vs unstim Wald test p-value: Stimulation DEX vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.0650 0.7463061 0.537690 1.387985 1.65142e-01 ENSG00000000419.12 239.5074 -0.6566940 0.291913 -2.249624 2.44728e-02 ENSG00000000457.14 94.7908 0.0320028 0.285518 0.112087 9.10755e-01 ENSG00000000460.17 28.9771 -0.0760268 0.312920 -0.242959 8.08037e-01 ENSG00000000938.13 566.1630 0.9289545 0.227658 4.080477 4.49434e-05 ENSG00000000971.15 53.8737 -1.2074592 0.505391 -2.389161 1.68869e-02 padj ENSG00000000003.14 0.46735489 ENSG00000000419.12 0.19812335 ENSG00000000457.14 0.96616666 ENSG00000000460.17 0.92241619 ENSG00000000938.13 0.00563962 ENSG00000000971.15 0.16718819

with(res_DEX, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-10,10)))
with(subset(res_DEX, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot DEX vs unstim

plotMA(res_DEX, ylim=c(-2,2))

TOP differentially expressed genes DEX vs unstim

setDT(resOrdered_DEX, keep.rownames = "ensembl")
resOrdered_DEX <- left_join(resOrdered_DEX, gencode_30, by = "ensembl")
resOrdered_DEX_top = resOrdered_DEX[order(resOrdered_DEX$padj) ,]
setDT(resOrdered_DEX_top, keep.rownames = "ensembl")
resOrdered_DEX_top = resOrdered_DEX_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_DEX_top)
#write.table(resOrdered_DEX_top, "DEG_DEX_GFM.txt")

DESeq2: Il4 vs unstim

Number of differentially expressed genes

res_IL4 <- results(dds, name="Stimulation_IL4_vs_unstim")
sum(res_IL4$padj < 0.05, na.rm=TRUE)

[1] 49

resOrdered_IL4 <- res_IL4[order(res_IL4$pvalue),] 
resOrdered_IL4 <- as.data.frame(resOrdered_IL4)

Volcano plot IL4 vs unstim

head(res_IL4)

log2 fold change (MLE): Stimulation IL4 vs unstim Wald test p-value: Stimulation IL4 vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.0650 0.7838708 0.571302 1.3720776 0.1700393 ENSG00000000419.12 239.5074 0.0516569 0.313110 0.1649799 0.8689598 ENSG00000000457.14 94.7908 -0.0275561 0.306218 -0.0899886 0.9282962 ENSG00000000460.17 28.9771 0.2019020 0.330779 0.6103841 0.5416074 ENSG00000000938.13 566.1630 -0.1234131 0.247244 -0.4991555 0.6176698 ENSG00000000971.15 53.8737 -1.0752739 0.553211 -1.9436962 0.0519321 padj ENSG00000000003.14 0.704114 ENSG00000000419.12 0.982346 ENSG00000000457.14 0.992126 ENSG00000000460.17 0.901563 ENSG00000000938.13 0.923007 ENSG00000000971.15 0.531886

with(res_IL4, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-10,10)))
with(subset(res_IL4, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot IL4 vs unstim

plotMA(res_IL4, ylim=c(-2,2))

TOP differentially expressed genes IL4 vs unstim

setDT(resOrdered_IL4, keep.rownames = "ensembl")
resOrdered_IL4 <- left_join(resOrdered_IL4, gencode_30, by = "ensembl")
resOrdered_IL4_top = resOrdered_IL4[order(resOrdered_IL4$padj) ,]
setDT(resOrdered_IL4_top, keep.rownames = "ensembl")
resOrdered_IL4_top = resOrdered_IL4_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_IL4_top)
#write.table(resOrdered_IL4_top, "DEG_IL4_GFM.txt")

DESeq2: ATP vs unstim

Number of differentially expressed genes

# generate results table for ATP vs unstim
res_ATP <- results(dds, name="Stimulation_ATP_vs_unstim")
sum(res_ATP$padj < 0.05, na.rm=TRUE)

[1] 134

resOrdered_ATP <- res_ATP[order(res_ATP$pvalue),] 
resOrdered_ATP <- as.data.frame(resOrdered_ATP)

Volcano plot ATP vs unstim

head(res_ATP)

log2 fold change (MLE): Stimulation ATP vs unstim Wald test p-value: Stimulation ATP vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.0650 -0.395191 0.866036 -0.456321 0.648159 ENSG00000000419.12 239.5074 -0.127991 0.457515 -0.279753 0.779667 ENSG00000000457.14 94.7908 0.113747 0.445230 0.255479 0.798353 ENSG00000000460.17 28.9771 0.159487 0.469636 0.339597 0.734160 ENSG00000000938.13 566.1630 0.333661 0.362592 0.920210 0.357463 ENSG00000000971.15 53.8737 -1.162431 0.837664 -1.387705 0.165227 padj ENSG00000000003.14 0.857835 ENSG00000000419.12 0.917424 ENSG00000000457.14 0.926040 ENSG00000000460.17 0.898640 ENSG00000000938.13 0.675375 ENSG00000000971.15 0.481472

with(res_ATP, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-10,10)))
with(subset(res_ATP, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot ATP vs unstim

plotMA(res_ATP, ylim=c(-2,2))

TOP differentially expressed genes ATP vs unstim

setDT(resOrdered_ATP, keep.rownames = "ensembl")
resOrdered_ATP <- left_join(resOrdered_ATP, gencode_30, by = "ensembl")
resOrdered_ATP_top = resOrdered_ATP[order(resOrdered_ATP$padj) ,]
setDT(resOrdered_ATP_top, keep.rownames = "ensembl")
resOrdered_ATP_top = resOrdered_ATP_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_ATP_top)
write.table(resOrdered_ATP_top, "DEG_ATP_GFM.txt")

#Create genelists with Log2FC < 1 and < -1 for different stimuli

resOrdered_LPS_p <- subset(resOrdered_LPS_top, padj < 0.05)
resOrdered_LPS_LFC <- subset(resOrdered_LPS_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_LPS_LFC, "LPS_GFM_FC1.txt")

resOrdered_IFNy_p <- subset(resOrdered_IFNy_top, padj < 0.05)
resOrdered_IFNy_LFC <- subset(resOrdered_IFNy_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_IFNy_LFC, "IFNy_GFM_FC1.txt")

resOrdered_R848_p <- subset(resOrdered_R848_top, padj < 0.05)
resOrdered_R848_LFC <- subset(resOrdered_R848_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_R848_LFC, "R848_GFM_FC1.txt")

resOrdered_DEX_p <- subset(resOrdered_DEX_top, padj < 0.05)
resOrdered_DEX_LFC <- subset(resOrdered_DEX_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_DEX_LFC, "DEX_GFM_FC1.txt")

resOrdered_IL4_p <- subset(resOrdered_IL4_top, padj < 0.05)
resOrdered_IL4_LFC <- subset(resOrdered_IL4_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_IL4_LFC, "IL4_GFM_FC1.txt")

resOrdered_ATP_p <- subset(resOrdered_ATP_top, padj < 0.05)
resOrdered_ATP_LFC <- subset(resOrdered_ATP_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_ATP_LFC, "ATP_GFM_FC1.txt")